--- title: `data_process` keywords: fastai sidebar: home_sidebar nb_path: "nbs/02_data_process.ipynb" ---
{% raw %}
run on collie.local
{% endraw %} {% raw %}
{% endraw %} {% raw %}
import holoviews as hv
hv.extension('bokeh')
{% endraw %} {% raw %}
# Input locations, given relative to the current working directory.
dir_data = Path('../../kgl_humanprotein_data/')
dir_segmodels = Path('../../hpa-cell-segmentation-models/')

# Competition data folder and its train / test sub-directories.
dir_hpa = dir_data.joinpath('hpa-single-cell-image-classification')
dir_trn, dir_test = (dir_hpa.joinpath(sub) for sub in ('train', 'test'))
{% endraw %}

Helpers

{% raw %}
{% endraw %} {% raw %}

imgids_from_directory[source]

imgids_from_directory(path)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
def unzip_zipped_files(src):
    '''
    Extract every ``*.zip`` archive found directly in `src`, then delete
    each archive whose extracted file is present.

    Each archive is assumed to unpack to a single file whose name is the
    archive's name without the trailing ``.zip``.  An archive is removed
    only if that file exists after extraction; otherwise it is kept.

    Args:
        src (pathlib.Path): Directory holding the zip archives.  Archives
            are extracted into this same directory.
    '''
    for archive_path in src.glob('*.zip'):
        target_dir = archive_path.parent
        with zipfile.ZipFile(archive_path) as zf:
            zf.extractall(path=target_dir)

        # File the archive is expected to have produced ('x.png.zip' -> 'x.png').
        unpacked = target_dir / archive_path.stem
        if not unpacked.exists():
            continue
        archive_path.unlink()
{% endraw %} {% raw %}
# list(dir_test.iterdir())
{% endraw %}

Image I/O

{% raw %}
{% endraw %} {% raw %}

read_img[source]

read_img(dir_data, image_id, color, image_size=None, suffix='.png')

{% endraw %} {% raw %}

load_RGBY_image[source]

load_RGBY_image(dir_data, image_id, rgb_only=False, suffix='.png', image_size=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

save_image[source]

save_image(dst, imgid, img)

{% endraw %} {% raw %}
idx = 3
imgid = imgids_testing[idx]
img = load_RGBY_image(dir_trn, imgid)
{% endraw %} {% raw %}
hv.RGB(img[...,:3])
{% endraw %}

Cell segmentation

Install hpacellseg:

{% raw %}
 
{% endraw %} {% raw %}
{% endraw %} {% raw %}

class CellSegmentator[source]

CellSegmentator(nuc_model, cell_model, *args, **kwargs) :: CellSegmentator

Uses pretrained DPN-Unet models to segment cells from images.

{% endraw %} {% raw %}

load_segmentator[source]

load_segmentator(dir_segmentator_models, scale_factor=0.25, device='cuda', padding=True, multi_channel_model=True)

{% endraw %} {% raw %}

get_cellmask[source]

get_cellmask(img, segmentator)

{% endraw %} {% raw %}
segmentator = load_segmentator(dir_segmodels, padding=True)
No GPU found, using CPU.
please compile abn
{% endraw %} {% raw %}
def test_segment_given_filepaths():
    '''
    Call the segmentator with lists of image file paths.

    Builds the red, yellow and blue channel paths under ``dir_hpa/'train'``
    for every id in ``imgids_testing`` and passes them to ``segmentator``
    as the ``red``, ``yellow`` and ``blue`` keyword arguments.  Nothing is
    asserted or returned; the call itself is the check.
    '''
    imgids = imgids_testing[:]

    # One list of file paths per channel, keyed by the segmentator's
    # keyword argument name.
    paths_by_channel = {
        channel: [dir_hpa/'train'/f'{imgid}_{channel}.png' for imgid in imgids]
        for channel in ('red', 'yellow', 'blue')
    }

    masks = segmentator(**paths_by_channel)
{% endraw %}

Masks, RLE and bboxes

{% raw %}
{% endraw %} {% raw %}

encode_binary_mask[source]

encode_binary_mask(mask)

Converts a binary mask into OID challenge encoding ascii text.

{% endraw %} {% raw %}

coco_rle_encode[source]

coco_rle_encode(bmask)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

rle_encode[source]

rle_encode(img, mask_val=1)

Turns our masks into RLE encoding to easily store them and feed them into models later on. See https://en.wikipedia.org/wiki/Run-length_encoding

Args: img (np.array): Segmentation array mask_val (int): Which value to use to create the RLE

Returns: RLE string

{% endraw %} {% raw %}

rle_decode[source]

rle_decode(rle_string, height, width)

Convert RLE string into a binary mask

Args: rle_string (rle_string): Run length encoding containing segmentation mask information height (int): Height of the original image the map comes from width (int): Width of the original image the map comes from

Returns: Numpy array of the binary segmentation mask for a given cell

{% endraw %} {% raw %}
{% endraw %} {% raw %}

mask2rles[source]

mask2rles(mask)

Args: mask (np.array): 2-D array with discrete values each representing a different class or object.

Returns: rles (list): COCO run-length encoding: {'size': [height, width], 'counts': encoded RLE}

{% endraw %} {% raw %}
{% endraw %} {% raw %}

rles2bboxes[source]

rles2bboxes(rles)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

segment_image[source]

segment_image(dir_img=None, imgid=None, segmentator=None)

{% endraw %} {% raw %}

segment_images[source]

segment_images(dir_img, imgids, segmentator)

{% endraw %}

Image processing

{% raw %}
{% endraw %} {% raw %}

resize_image[source]

resize_image(img, sz)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

crop_image[source]

crop_image(img, bbox, bmask=None)

Args: img (np.array): Image to be cropped by bbox. bbox (np.array): Bounding box in terms of [x0, y0, x1, y1]. bmask (np.array, np.uint8): Binary mask for the cell.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

remove_faint_greens[source]

remove_faint_greens(xs, crops, green_thres=64)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

pad_to_square[source]

pad_to_square(img)

Pad an image to a square size, centering it as much as possible.

{% endraw %}

Saving RLE

The feather format seems convenient for saving a pd.DataFrame with complex and different data types.

{% raw %}
# for i in range(len(rles)):
#     df = df.append(
#         {'rle': rles[i]}, ignore_index=True)
    
# df.to_feather('df.feather')
{% endraw %} {% raw %}
 
{% endraw %} {% raw %}
 
{% endraw %} {% raw %}
 
{% endraw %}

Segment train set

@dschettler8845 has kindly provided the results of applying HPA's Cell Segmentator on the training images: https://www.kaggle.com/dschettler8845/starter-on-how-to-load-rle-masks.

The csv file is downloaded, renamed and placed at:

{% raw %}
{% endraw %} {% raw %}

load_seg_trn[source]

load_seg_trn(pth_csv)

Loads @dschettler8845's segmentation results for the train set.

{% endraw %} {% raw %}
{% endraw %} {% raw %}

split_cells[source]

split_cells(df_seg)

Args: df_seg (pd.DataFrame): Each row is an image.

Returns: df_cells (pd.DataFrame): Each row is a cell.

{% endraw %} {% raw %}
%%time

# Locate @dschettler8845's csv file containing segmentation results
dir_seg_trn = Path(DATA_DIR) / 'train' 
pth_seg_trn = dir_seg_trn / 'train.csv'

# Load it.
df_seg_trn = load_seg_trn(pth_seg_trn)
CPU times: user 35.7 s, sys: 9.83 s, total: 45.6 s
Wall time: 48.7 s
{% endraw %} {% raw %}
# Restrict the segmentation table to the ids returned for dir_trn
# (presumably the images actually present on disk — confirm against
# imgids_from_directory).
imgids = imgids_from_directory(dir_trn)
# Index by 'ID' and select with .loc so the rows come out in the order of
# `imgids`; .loc raises KeyError if an id has no row in the csv.
# reset_index() turns 'ID' back into a regular column.
df_seg_trn = df_seg_trn.set_index('ID').loc[imgids].reset_index()
{% endraw %} {% raw %}
%%time
df_cells_trn = split_cells(df_seg_trn)
CPU times: user 1.39 s, sys: 396 ms, total: 1.78 s
Wall time: 1.87 s
{% endraw %} {% raw %}
# Extrapolate the 1.87 s wall time measured for split_cells on the current
# len(df_seg_trn) images to 21806 images (presumably the size of the full
# train set — confirm); dividing by 60**2 converts seconds to hours.
21806 / len(df_seg_trn) * 1.87 / 60**2
2.265401111111111
{% endraw %}

Segment test set

{% raw %}
imgids = imgids_from_directory(dir_test)
imgids
['004a270d-34a2-4d60-bbe4-365fca868193',
 '0040581b-f1f2-4fbe-b043-b6bfea5404bb']
{% endraw %} {% raw %}
df_cells_test = segment_images(dir_test, imgids, segmentator)
100%|██████████| 2/2 [00:59<00:00, 29.73s/it]
{% endraw %} {% raw %}
df_cells_test.shape
(27, 3)
{% endraw %}

Generate cell crops

{% raw %}
{% endraw %} {% raw %}

generate_crops[source]

generate_crops(df_cells, src, dst, out_sz=768)

  • Crop out each cell from its image.
  • Resize the crop to a square and save to disk.
  • Record the crop's maximum green channel value.
{% endraw %} {% raw %}
crop_sz = 384
{% endraw %} {% raw %}
%%time 

# Output directory for the train-set cell crops; its name carries the crop size.
dir_crops_trn = DATA_DIR / 'train' / f'images_{crop_sz}'
# No parents=True: DATA_DIR/'train' must already exist.
dir_crops_trn.mkdir(exist_ok=True)

# Crop each cell from its image in dir_trn, resize it to a crop_sz square,
# save it under dir_crops_trn and record the crop's maximum green value
# (per generate_crops' description).  The returned frame replaces df_cells_trn.
df_cells_trn = generate_crops(
    df_cells_trn, dir_trn, dir_crops_trn, out_sz=crop_sz)
{% endraw %} {% raw %}
dir_data_raw = DATA_DIR / 'raw'
dir_data_raw.mkdir(exist_ok=True)

df_cells_trn.to_feather(dir_data_raw / 'train.feather')
CPU times: user 3.36 s, sys: 848 ms, total: 4.21 s
Wall time: 4.62 s
{% endraw %} {% raw %}
df_cells_trn
Id rle bbox Target max_green
0 000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_0 {'size': [2048, 2048], 'counts': b'PPTb06jo11O... [290, 0, 770, 138] 7|1|2|0 241
1 000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_1 {'size': [2048, 2048], 'counts': b'YVTZ1S1ln12... [674, 0, 1006, 666] 7|1|2|0 255
2 000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_2 {'size': [2048, 2048], 'counts': b'TP^i11ko15O... [919, 0, 1366, 270] 7|1|2|0 255
3 000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_3 {'size': [2048, 2048], 'counts': b'Z9l5Tj10000... [0, 110, 662, 638] 7|1|2|0 255
4 000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_4 {'size': [2048, 2048], 'counts': b'P\\TV2l0So1... [1122, 190, 1518, 724] 7|1|2|0 255
... ... ... ... ... ...
102 a34d8680-bb99-11e8-b2b9-ac1f6b6435d0_13 {'size': [2048, 2048], 'counts': b'cZmb03jo13L... [302, 1239, 438, 1414] 18 50
103 a34d8680-bb99-11e8-b2b9-ac1f6b6435d0_14 {'size': [2048, 2048], 'counts': b'Siil02mo12N... [460, 1306, 953, 2048] 18 34
104 a34d8680-bb99-11e8-b2b9-ac1f6b6435d0_15 {'size': [2048, 2048], 'counts': b'UdUf2n0Qo12... [1378, 1444, 2048, 2006] 18 67
105 a34d8680-bb99-11e8-b2b9-ac1f6b6435d0_16 {'size': [2048, 2048], 'counts': b'nc1R<nc1000... [0, 1662, 354, 2048] 18 38
106 a34d8680-bb99-11e8-b2b9-ac1f6b6435d0_17 {'size': [2048, 2048], 'counts': b'Pe]\\2d0[o1... [1222, 1682, 1882, 2048] 18 31

107 rows × 5 columns

{% endraw %} {% raw %}
%%time 

# Output directory for the test-set cell crops; its name carries the crop size.
dir_crops_test = DATA_DIR/'test'/f'images_{crop_sz}'
# No parents=True: DATA_DIR/'test' must already exist.
dir_crops_test.mkdir(exist_ok=True)

# Crop each cell from its image in dir_test, resize it to a crop_sz square,
# save it under dir_crops_test and record the crop's maximum green value
# (per generate_crops' description).  The returned frame replaces df_cells_test.
df_cells_test = generate_crops(
    df_cells_test, dir_test, dir_crops_test, out_sz=crop_sz)

# Persist the per-cell table for the test set in feather format.
df_cells_test.to_feather(dir_data_raw/'test.feather')
CPU times: user 934 ms, sys: 225 ms, total: 1.16 s
Wall time: 1.39 s
{% endraw %} {% raw %}
# -----------------------------------------------------

# imgids = df_cell_trn['Id'].apply(lambda o: o.split('_')[0])
# iter_grpd_imgids = iter(df_cell_trn.groupby(imgids))

# imgid, df_img = next(iter_grpd_imgids)

# crops = []
# for cellid, df_cell in df_img.groupby('Id'):
#     crop = load_RGBY_image(dir_crops_trn, cellid)
#     crops.append(crop)
    
# img = load_RGBY_image(dir_trn, imgid)

# imgs = [img] + crops

# figs = [hv.RGB(img[...,:3]) for img in imgs]
# hv.Layout(figs).cols(5).opts(
#     hv.opts.RGB(yaxis=None, height=100, xaxis=None, width=100))
{% endraw %}

Generate meta

{% raw %}
 
{% endraw %} {% raw %}
{% endraw %} {% raw %}

fill_targets[source]

fill_targets(row)

{% endraw %} {% raw %}

generate_meta[source]

generate_meta(dir_mdata, fname, dataset='train')

{% endraw %} {% raw %}
meta_dir = DATA_DIR / 'meta'
meta_dir.mkdir(exist_ok=True)
{% endraw %} {% raw %}
generate_meta(meta_dir, 'train.feather', dataset='train')
{% endraw %}

This step is not done for test.feather as it's not labelled.

{% raw %}
# generate_meta(meta_dir, 'HPAv18RBGY_wodpl.csv', dataset='external')
{% endraw %}

Generate split

{% raw %}
 
{% endraw %} {% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_meta[source]

get_meta()

{% endraw %} {% raw %}
{% endraw %} {% raw %}

create_split_file[source]

create_split_file(data_set='train', name='train', num=None)

{% endraw %} {% raw %}

create_random_split[source]

create_random_split(dir_mdata, train_meta, external_meta=None, n_splits=4, alias='random')

{% endraw %} {% raw %}

load_match_info[source]

load_match_info()

{% endraw %} {% raw %}

generate_noleak_split[source]

generate_noleak_split(n_splits=5)

{% endraw %} {% raw %}
# Size passed to the small split files below: 8 for a quick test run,
# 160 in the original pipeline.
num = 8  # Testing
# num = 160  # Original

# Small train / valid / test split files; the logged names carry `num`
# (train_8, valid_8, test_8).
create_split_file(data_set="train", name="train", num=num)
create_split_file(data_set="train", name="valid", num=num)
create_split_file(data_set="test", name="test", num=num)

# num=None: no size given; the logged names are train_107 and test_27,
# presumably the full row counts of each set — confirm in create_split_file.
create_split_file(data_set="train", name="train", num=None)
create_split_file(data_set="test", name="test", num=None)

# Only the first element of get_meta()'s return value is used here.
train_meta, _ = get_meta()
# NOTE(review): the documented signature is
#   create_random_split(dir_mdata, train_meta, external_meta=None, n_splits=4, alias='random')
# but train_meta is passed here as the first positional argument and no
# dir_mdata is given — confirm this call matches the current function.
create_random_split(train_meta, n_splits=5)
# create_random_split(train_meta, external_meta=external_meta, n_splits=5, alias='random_ext')
create split file: train_8
create split file: valid_8
create split file: test_8
create split file: train_107
create split file: test_27
Nucleoplasm 15 4
Nuclear membrane 36 9
Nucleoli 15 4
Nucleoli fibrillar center 0 0
Nuclear speckles 0 0
Nuclear bodies 8 2
Endoplasmic reticulum 0 0
Golgi apparatus 15 4
Intermediate filaments 0 0
Actin filaments 0 0
Microtubules 0 0
Mitotic spindle 0 0
Centrosome 27 7
Plasma membrane 0 0
Mitochondria 0 0
Aggresome 0 0
Cytosol 0 0
Vesicles and punctate cytosolic patterns 0 0
Negative 15 3
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_train_cv0.feather, shape: (86, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_valid_cv0.feather, shape: (21, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_train_cv1.feather, shape: (85, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_valid_cv1.feather, shape: (22, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_train_cv2.feather, shape: (86, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_valid_cv2.feather, shape: (21, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_train_cv3.feather, shape: (85, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_valid_cv3.feather, shape: (22, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_train_cv4.feather, shape: (86, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_folds5/random_valid_cv4.feather, shape: (21, 25)
{% endraw %} {% raw %}
 
{% endraw %}

Generate split for samples with a single image-level label.

{% raw %}
def _count_labels(target):
    # Number of '|'-separated labels in one Target string.
    return len(target.split('|'))


# Label count per sample.
n_target = train_meta['Target'].apply(_count_labels)
# Keep the samples carrying exactly one label and renumber the rows from 0.
is_single_label = n_target.eq(1)
train_meta_single_label = train_meta.loc[is_single_label].reset_index(drop=True)
{% endraw %} {% raw %}
# 5-fold random split over the single-label samples only; the alias
# 'random_ntarget1' shows up in the output directory name in the log below.
# NOTE(review): the documented signature is
#   create_random_split(dir_mdata, train_meta, external_meta=None, n_splits=4, alias='random')
# but the meta frame is passed here as the first positional argument and no
# dir_mdata is given — confirm this call matches the current function.
create_random_split(
    train_meta_single_label, n_splits=5, alias='random_ntarget1')
Nucleoplasm 0 0
Nuclear membrane 21 5
Nucleoli 0 0
Nucleoli fibrillar center 0 0
Nuclear speckles 0 0
Nuclear bodies 8 2
Endoplasmic reticulum 0 0
Golgi apparatus 0 0
Intermediate filaments 0 0
Actin filaments 0 0
Microtubules 0 0
Mitotic spindle 0 0
Centrosome 27 7
Plasma membrane 0 0
Mitochondria 0 0
Aggresome 0 0
Cytosol 0 0
Vesicles and punctate cytosolic patterns 0 0
Negative 14 4
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_train_cv0.feather, shape: (70, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_valid_cv0.feather, shape: (18, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_train_cv1.feather, shape: (71, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_valid_cv1.feather, shape: (17, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_train_cv2.feather, shape: (70, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_valid_cv2.feather, shape: (18, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_train_cv3.feather, shape: (70, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_valid_cv3.feather, shape: (18, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_train_cv4.feather, shape: (71, 25)
create split file: ../../kgl_humanprotein_data/protein/split/random_ntarget1_folds5/random_valid_cv4.feather, shape: (17, 25)
{% endraw %}

Calculate images' mean and std

{% raw %}
 
{% endraw %} {% raw %}
{% endraw %} {% raw %}

get_img_mean_std[source]

get_img_mean_std(img_dir, color, img_mean, img_std)

{% endraw %} {% raw %}
# For each colour channel, gather statistics over the test crops and then
# the train crops, and print the channel name with the averaged values.
for color in ['red', 'green', 'blue', 'yellow']:
    # Fresh accumulators per channel; get_img_mean_std takes them in and
    # returns the pair that is carried on to the next directory.
    img_mean, img_std = [], []
    for subdir in (f'test/images_{crop_sz}', f'train/images_{crop_sz}'):
        img_dir = opj(DATA_DIR, subdir)
        img_mean, img_std = get_img_mean_std(img_dir, color, img_mean, img_std)
    channel_mean = np.around(np.mean(img_mean), decimals=6)
    channel_std = np.around(np.mean(img_std), decimals=6)
    print(color, channel_mean, channel_std)
100%|██████████| 27/27 [00:00<00:00, 113.79it/s]
100%|██████████| 107/107 [00:00<00:00, 123.52it/s]
 44%|████▍     | 12/27 [00:00<00:00, 116.10it/s]
red 0.049556 0.098254
100%|██████████| 27/27 [00:00<00:00, 115.01it/s]
100%|██████████| 107/107 [00:00<00:00, 133.29it/s]
100%|██████████| 27/27 [00:00<00:00, 141.06it/s]
  0%|          | 0/107 [00:00<?, ?it/s]
green 0.026136 0.054394
100%|██████████| 107/107 [00:00<00:00, 112.30it/s]
 41%|████      | 11/27 [00:00<00:00, 106.68it/s]
blue 0.038826 0.118075
100%|██████████| 27/27 [00:00<00:00, 110.65it/s]
100%|██████████| 107/107 [00:00<00:00, 117.61it/s]
yellow 0.067102 0.120626

{% endraw %}